#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#Author: alvayang <alvayang@tabex.org>
#Last Change: 
#Description: 

import libxml2
import sys

# Parse the RSS-style document stored in 'test.data' and print, for every
# <item> found under /rss/sogouresult, a dict holding the text of a fixed
# set of child elements.  The script is written for Python 2 (byte strings
# are decoded/encoded explicitly).

# Read the raw bytes through a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open('test.data', 'rb') as source:
    text = source.read()
# The input is decoded as UTF-8 and re-encoded as GBK, which is the
# encoding announced to the parser below.
text = text.decode('utf-8').encode('gbk')

encoding = 'gbk'
# readDoc() is the XML reader, so use the XML_PARSE_* flag family.
# XML_PARSE_RECOVER has the same numeric value as the HTML_PARSE_RECOVER
# flag used previously, so parser behaviour is unchanged.
# For real HTML input, libxml2.htmlReadDoc() with HTML_PARSE_* flags would
# be the matching entry point.
options = libxml2.XML_PARSE_RECOVER
doc = libxml2.readDoc(text, None, encoding, options).doc
ctxt = doc.xpathNewContext()
try:
    title = ctxt.xpathEval(u'//rss/sogouresult/item')
    if title:
        want = [u'title', u'album', u'size', u'type', u'artist', u'urls']
        for z in title:
            # Fresh dict per item so no value can leak from a previous item.
            o = {}
            for q in want:
                # Evaluate each field once and test *that* result: checking
                # only for <title> raised IndexError when another wanted
                # child was missing from an item that did have a title.
                found = z.xpathEval(q)
                if found:
                    o[q] = found[0].get_content().decode('utf-8')
                else:
                    o[q] = u'没有合理的解释'
            # Single-argument print(...) behaves the same as the print
            # statement under Python 2.
            print(o)
    else:
        sys.stderr.write("No correct\n")
finally:
    # The libxml2 bindings do not release these C-level objects on their
    # own; free the XPath context first, then the document it refers to.
    ctxt.xpathFreeContext()
    doc.freeDoc()

